A pixel-based approach to classify informal Settlements in the Dominican Republic’s capital using PlanetScope data.

Author: Francina Cabrera Fermin

CASA UCL 2021

This document provides instructions to extract informal settlements using spectral and texture features from 3 metres resolution satellite scenes from PlanetScope. This project creates several Random Forest Classification model based on the spectral and gray level co-occurrence matrix (variance) values of satellite image pixels.

Description of the data:

  1. Two PlanetScope Analytic Ortho scenes with 3 metres pixel resolution, along with their Unusable Data Masks (UDM). Taken on the 23/02/2021 23:20:17. Downloaded from: https://www.Planet.com
  2. Visually collected labelled training and assessment sample using VHR Google maps imagery.
  3. Distrito Nacional’s boundary (.shp) collected in the 2010 Population Census by National Statistics Office of the Dominican Republic. Downloaded from: https://www.one.gob.do/informaciones-cartograficas/shapefiles
  4. Distrito Nacional’s circumscription 3 (.shp) collected in the 2010 Population Census by National Statistics Office of the Dominican Republic. Downloaded from: https://www.one.gob.do/informaciones-cartograficas/shapefiles
  5. Dominican Republic’s provinces (.shp) collected in the 2010 Population Census by National Statistics Office of the Dominican Republic. Downloaded from: https://www.one.gob.do/informaciones-cartograficas/shapefiles

The data is stored in an online github repository: https://github.com/francicabrera/DI_Francina2021.git

This code follows the process from:

https://pages.cms.hu-berlin.de/EOL/gcg_eo/index.html

https://andrewmaclachlan.github.io/CASA0005repo/advanced-raster-analysis.html

1. Libraries

Install packages and load the required libraries

## Install packages and load the required libraries

library(sf) # to read layers 
library(tmap) # visualisation
library(googledrive) # to load data from google drive 
library(here) # file referencing 
library(raster) # to work with rasters
library(sp)
library(GGally) # to combine maps in a single plot
library(tidyverse) # data management
library(ggplot2) # to make maps and graphics
library(reshape2) # data management
library(caret) # create data partition
library(e1071) # to optimise the mtry parameter
library(randomForest) # to build the classification model
library(glcm) # to create GLCM model
library(RStoolbox) # to perform PCA
library(ggsn) # visualisation
library(cowplot) # to arrange inset maps

2. Data handling

Load and prepare the data

2.1. Administrative boundaries

# Dominican Republic Provinces (for visualisation only)
provinces <- st_read(here("data","geo", "PROVCenso2010.shp")) %>% 
  # Project to EPSG:32619. This is the projected coordinate system for the Dominican Republic.
  st_transform(.,32619)
## Reading layer `PROVCenso2010' from data source `/Users/francinacabrera/Documents/CASA/DI_Francina2021/Slum_detection/data/geo/PROVCenso2010.shp' using driver `ESRI Shapefile'
## Simple feature collection with 32 features and 4 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 182215.8 ymin: 1933512 xmax: 571429.3 ymax: 2205216
## projected CRS:  WGS 84 / UTM zone 19N

# Distrito Nacional's boundary (for visualisation only)
dn_boundary <- st_read(here("data","geo","DN_boundary.shp")) %>%
  # Project to EPSG:32619. This is the projected coordinate system for the Dominican Republic.
  st_transform(.,32619)
## Reading layer `DN_boundary' from data source `/Users/francinacabrera/Documents/CASA/DI_Francina2021/Slum_detection/data/geo/DN_boundary.shp' using driver `ESRI Shapefile'
## Simple feature collection with 1 feature and 4 fields
## geometry type:  POLYGON
## dimension:      XY
## bbox:           xmin: 394514.9 ymin: 2037241 xmax: 407670.6 ymax: 2051052
## projected CRS:  WGS 84 / UTM zone 19N

# Distrito Nacional's circumscriptions (for visualisation only)
dn_circ <- st_read(here("data","geo","DN_circumscriptions.shp")) %>%
  # Project to EPSG:32619. This is the projected coordinate system for the Dominican Republic.
  st_transform(.,32619) 
## Reading layer `DN_circumscriptions' from data source `/Users/francinacabrera/Documents/CASA/DI_Francina2021/Slum_detection/data/geo/DN_circumscriptions.shp' using driver `ESRI Shapefile'
## Simple feature collection with 3 features and 7 fields
## geometry type:  POLYGON
## dimension:      XY
## bbox:           xmin: 394514.9 ymin: 2037241 xmax: 407670.6 ymax: 2051052
## projected CRS:  WGS 84 / UTM zone 19N

# Circumscription 3
c3_boundary <- st_read(here("data","geo","Circ3.shp")) %>% 
  # Project to EPSG:32619. This is the projected coordinate system for the Dominican Republic.
  st_transform(.,32619)
## Reading layer `Circ3' from data source `/Users/francinacabrera/Documents/CASA/DI_Francina2021/Slum_detection/data/geo/Circ3.shp' using driver `ESRI Shapefile'
## Simple feature collection with 1 feature and 7 fields
## geometry type:  POLYGON
## dimension:      XY
## bbox:           xmin: 403336.9 ymin: 2043038 xmax: 407670.6 ymax: 2047337
## projected CRS:  WGS 84 / UTM zone 19N

2.2. PlanetScope scenes.

# The images will be downloaded from Google Drive.
# First scene:
# Create a temporary file that will be substituted with the downloaded raster.
temp49 <- tempfile(fileext = ".tif")
# Download the raster with the file's id from Google Drive.
dl49 <- drive_download(as_id("1ftQXEszRYiHfiCDkm1kJ3parha85569w"),
                     path = temp49, overwrite = TRUE)
# Create the raster stack
raster49 <- stack(temp49)
# Check the coordinate system is WGS 84 / UTM zone 19N,
# if not use: %>% projectRaster(., crs=32619)
crs(raster49) 
## CRS arguments:
##  +proj=utm +zone=19 +datum=WGS84 +units=m +no_defs

# Second scene:
# Create a temporary file that will be substituted with the downloaded raster.
temp50 <- tempfile(fileext = ".tif")
# Download the raster with the file's id from Google Drive.
dl50 <- drive_download(as_id("1But-SQaZkdazRBNMUKXjocLyiD7GQkJL"),
                       path = temp50, overwrite = TRUE)
# Create the raster stack
raster50 <- stack(temp50)
# Check the coordinate system is WGS 84 / UTM zone 19N,
# if not use: %>% projectRaster(., crs=32619)
crs(raster50) 
## CRS arguments:
##  +proj=utm +zone=19 +datum=WGS84 +units=m +no_defs
# Quick look at the images by the different bands
plot(raster49, main="PlanetScope Distrito Nacional Scene 1")

plot(raster50, main="PlanetScope Distrito Nacional Scene 2")

Cloud Assessment

PlanetScope provides Usable Data Masks (UDM) to assess the data quality of their images. See more here: https://developers.planet.com/docs/data/udm-2/ Let’s verify the appearance of pixels covered with clouds or shadow.

# Create a temporary file that will be substituted with the downloaded UDM raster.
temp49_mask <- tempfile(fileext = ".tif")
# Download the UDM raster with the file's id from Google Drive.
dl49_mask <- drive_download(as_id("18r619B2o98q1JHxtkSyPFArvVKqKcpev"),
                            path = temp49_mask, overwrite = TRUE)
# Create the raster stack
raster49_mask <- stack(temp49_mask)
# Check the coordinate system is WGS 84 / UTM zone 19N,
# if not use: %>% projectRaster(., crs=32619)
crs(raster49_mask)
## CRS arguments:
##  +proj=utm +zone=19 +datum=WGS84 +units=m +no_defs
Find the most frequent values using freq().

Consider the meaning of 0 as “FALSE” or “NO”, 1 equals “TRUE” or “YES”. Or, 0: no cloud, 1: cloud; 0: no shadow, 1: shadow. See more here: https://pages.cms.hu-berlin.de/EOL/gcg_eo/02_data_quality.html

freq(raster49_mask$cloud) 
##      value    count
## [1,]     0 30305400

Repeat the same process with the UDM raster from the second scene.

# Create a temporary file that will be substituted with the downloaded raster.
temp50_mask <- tempfile(fileext = ".tif")
# Download the UDM raster with the file's id from Google Drive.
dl50_mask <- drive_download(as_id("18HnyLNzhswsV2S3b1EokS5OHXKsesZ99"),
                            path = temp50_mask, overwrite = TRUE)
# Create the raster stack
raster50_mask <- stack(temp50_mask)
# Check the coordinate system is WGS 84 / UTM zone 19N,
# if not use: %>% projectRaster(., crs=32619)
crs(raster50_mask)
## CRS arguments:
##  +proj=utm +zone=19 +datum=WGS84 +units=m +no_defs
# Find the most frequent values using freq().
freq(raster50_mask$cloud) 
##      value    count
## [1,]     0 30277741

As both raster show no signal of clouds, the masks wont be used. Therefore, we will remain with the original scenes.

Mosaic

Let’s create a mosaic out of the two scenes

See more here: http://www.qgistutorials.com/en/docs/3/raster_mosaicing_and_clipping.html

# Merge the two original scenes to create a mosaic using the function mosaic()
raster_mosaic <- mosaic(raster49,
                        raster50, 
                        fun=mean, 
                        tolerance=0.05, 
                        filename="raster_mosaic", 
                        overwrite=TRUE)
# Quick look at the mosaic
plot(raster_mosaic, main="Mosaic of PlanetScope scenes")


# Crop the mosaic to the Distrito Nacional's boundary with crop()
# This raster will be used for visualisation only.
mosaic_DN <- crop(raster_mosaic, 
                  dn_boundary, 
                  filename="mosaicDN_crop", 
                  overwrite=TRUE) %>%
  mask(., dn_boundary)

# Crop the mosaic to the Circumscription 3 (C3)'s boundary with crop()
mosaic_C3 <- crop(raster_mosaic, 
                  c3_boundary, 
                  filename="mosaicC3_crop", 
                  overwrite=TRUE) %>% 
  mask(., c3_boundary)

Information on the Circumscription 3 (C3)’s raster stack.

extent(mosaic_C3) # extent
## class      : Extent 
## xmin       : 403338 
## xmax       : 407670 
## ymin       : 2043039 
## ymax       : 2047338
ncell(mosaic_C3) # number of cells
## [1] 2069252
dim(mosaic_C3) # number of rows, columns, layers
## [1] 1433 1444    4
nlayers(mosaic_C3) # number of layers
## [1] 4
res(mosaic_C3) # xres, yres
## [1] 3 3
Name the Bands based on where they sample the electromagnetic spectrum.
# Mosaic
names(raster_mosaic) <- c('blue', 'green', 'red', 'NIR') 
# DN's mosaic
names(mosaic_DN) <- c('blue', 'green', 'red', 'NIR') 
# C3' mosaic
names(mosaic_C3) <- c('blue', 'green', 'red', 'NIR')
Plot the data in true colours and false composite.
# C3
# true colour composite
rC3_rgb <- stack(mosaic_C3$red, mosaic_C3$green, mosaic_C3$blue) %>% 
  plotRGB(.,axes=TRUE, stretch="lin")

# false colour composite
rC3_false <- stack(mosaic_C3$NIR, mosaic_C3$red, mosaic_C3$green) %>% 
  plotRGB(.,axes=TRUE, stretch="lin")

2.3. Training and Test samples

The training data was collected digitally using a very high resolution (VHR) imagery as a reference: the Google maps terrain in QGIS.

The location of the informal settlements in the Distrito Nacional was obtained from: https://web.one.gob.do/publicaciones/2016/estudio-metodologia-para-la-identificacion-de-tugurios-en-el-distrito-nacional-censo-2010/

Following the classification for informal and formal settlements from: https://ieeexplore.ieee.org/document/6236225 and https://doi.org/10.1016/j.compenvurbsys.2011.11.001, the classification follows six classes:

  1. Informal settlements Type I: Slums with an irregular shape, high density of buildings, and where paved roads could be visually identified. Roofs: grey, brown, red, and white. Same building with varying colours.

  2. Informal settlements Type II: Slums with an irregular shape, high density of buildings, and undefined roads. Roofs: grey, brown, red, and white. Same building with varying colours.

  3. Informal settlements Type III: Slums with an irregular shape, high density of buildings, undefined roads, and proximity to hazardous elements (rivers or ravines). Roofs: grey, brown, red, and white. Same building with varying colours. Hazardous arrangement.

  4. Formal settlement: Settlements with regular shape, regular density of buildings and paved roads.

  5. Non-settlements: Bare ground, grassland, forest, water, parking lots and sports courts.

  6. Roads and asphalt: Paved roads.

# Read training points.
trainC3 <- st_read(here("data", "training_data","TrainingData_C3.shp")) %>% 
  # Project to EPSG:32619.
  st_transform(.,32619)
## Reading layer `TrainingData_C3' from data source `/Users/francinacabrera/Documents/CASA/DI_Francina2021/Slum_detection/data/training_data/TrainingData_C3.shp' using driver `ESRI Shapefile'
## Simple feature collection with 11733 features and 4 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 403348 ymin: 2043067 xmax: 407656.7 ymax: 2047330
## projected CRS:  WGS 84 / UTM zone 19N
# Quick look of the feature
qtm(trainC3)

crs(trainC3) # Check crs
## CRS arguments:
##  +proj=utm +zone=19 +datum=WGS84 +units=m +no_defs
# Check how many data points per class are
trainC3 %>% group_by(classID) %>% count()
## Simple feature collection with 6 features and 2 fields
## geometry type:  MULTIPOINT
## dimension:      XY
## bbox:           xmin: 403348 ymin: 2043067 xmax: 407656.7 ymax: 2047330
## projected CRS:  WGS 84 / UTM zone 19N
## # A tibble: 6 x 3
##   classID     n                                                         geometry
## *   <dbl> <int>                                                 <MULTIPOINT [m]>
## 1       1   885 ((404114.5 2045755), (404119 2045677), (404126.3 2045593), (404…
## 2       2  3292 ((403493.6 2046463), (403495.7 2046399), (403498.3 2046478), (4…
## 3       3   627 ((403456.5 2046842), (403471.2 2046829), (403510.5 2046837), (4…
## 4       4  4444 ((403369.7 2046079), (403370.2 2046187), (403372.5 2046123), (4…
## 5       5  1071 ((403358.1 2046402), (403360.3 2046352), (403363.1 2046341), (4…
## 6       6  1414 ((403348 2046127), (403350.9 2046049), (403353.7 2045969), (403…

# Extract image values at training point locations
trainC3.sp <- raster::extract(mosaic_C3, trainC3, sp=T)

# Convert to data.frame 
trainC3.df <- as.data.frame(trainC3.sp)
Create boxplots of reflectance grouped by land cover class.

This will allow you to identify any outliers in the data.

# Melt dataframe containing point id, classID, and 6 spectral bands
spectra.df <- melt(trainC3.df, id.vars='classID', 
                   measure.vars=c('blue', 'green', 'red', 'NIR'))

# Create boxplots of spectral bands per class
ggplot(spectra.df, aes(x=variable, y=value, color=classID)) +
  geom_boxplot() +
  theme_bw()

The function that we’ll use to train the classification model expects the dependent variable to be of type factor.

# Use as.factor() for conversion of the classID column.
trainC3.df$classID <- as.factor(trainC3.df$classID) 
str(trainC3.df) #allows you to see the classes of the variables (all numeric)
## 'data.frame':    11733 obs. of  10 variables:
##  $ id        : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ classID   : Factor w/ 6 levels "1","2","3","4",..: 4 5 2 4 4 4 4 4 4 4 ...
##  $ class_name: chr  "formal_settlement" "non_settlement" "informal_settlement_type2" "formal_settlement" ...
##  $ circ_num  : num  3 3 3 3 3 3 3 3 3 3 ...
##  $ blue      : int  547 1192 761 1955 773 1489 1405 989 1383 541 ...
##  $ green     : int  632 1396 839 2161 858 1534 1472 1137 1507 618 ...
##  $ red       : int  523 1638 833 2198 810 1457 1482 1152 1369 529 ...
##  $ NIR       : int  965 2367 1188 2655 1318 1875 1787 1750 1699 1091 ...
##  $ coords.x1 : num  404857 404829 403756 403969 403935 ...
##  $ coords.x2 : num  2045945 2045801 2046283 2046086 2046240 ...

# Include useful predictors.
trainC3_df <- select(trainC3.df, -c("id", "class_name","circ_num","coords.x1","coords.x2"))

# The classification model cannot deal with NoData (NA) values. Remove NAs from the data.frame.
sum(is.na(trainC3_df)) # check how many values are null
## [1] 0
trainC3_df <- na.omit(trainC3_df)
sum(is.na(trainC3_df))
## [1] 0

Stratified random sampling helps us to validate the map using a sufficient amount of samples for each for the four classes

See more about this here: https://stackoverflow.com/questions/20776887/stratified-splitting-the-data/30181396

# createDataPartition does a stratified random split of the data.
# Set a portion of the data aside for testing
sample <- createDataPartition(trainC3_df$classID, p = .80, list = FALSE)
train <- trainC3_df[sample,]
test <- trainC3_df[-sample,]

# Check how many data points are per class
train %>% group_by(classID) %>% count()
## # A tibble: 6 x 2
## # Groups:   classID [6]
##   classID     n
##   <fct>   <int>
## 1 1         708
## 2 2        2634
## 3 3         502
## 4 4        3556
## 5 5         857
## 6 6        1132
test %>% group_by(classID) %>% count()
## # A tibble: 6 x 2
## # Groups:   classID [6]
##   classID     n
##   <fct>   <int>
## 1 1         177
## 2 2         658
## 3 3         125
## 4 4         888
## 5 5         214
## 6 6         282

# Check the dimension of the objects
dim(train)
## [1] 9389    5
dim(test)
## [1] 2344    5

3. Data Classification

This section creates a classification model using the spectral bands of the mosaic.

This section follows the code from this source: https://pages.cms.hu-berlin.de/EOL/gcg_eo/05_machine_learning.html

Now, we’ll build two models. One with spectral data only and other fusing spectral and texture data.

3.1. Model 1: Spectral data only.

This part of the process uses the package: “e1071”. We will optimise the mtry parameter.

  1. Number of trees (ntree): it is unnecessary to tune in the ntree, instead it is recommendednto set it to a large number and compare across multiple runs of the model.

  2. Number of variables (mtry): set the number of k-folds that will run to find the optimal parameter.

Read more here: https://stats.stackexchange.com/questions/348245/do-we-have-to-tune-the-number-of-trees-in-a-random-forest

Other sources: https://www.youtube.com/watch?v=v5Bmz2eMd7M

# Define accuracy from 5-fold cross-validation as optimization measure
cv <- tune.control(cross = 5) 

# Use tune.randomForest to assess the optimal combination of ntree and mtry
RF_modelC3.tune <- tune.randomForest(classID~., 
                                     data = train, 
                                     ntree=750, 
                                     mtry=c(2:10), 
                                     importance = TRUE,
                                     tunecontrol = cv)

# Store the best model in a new object for further use
RF_modelC3 <- RF_modelC3.tune$best.model
print(RF_modelC3)
## 
## Call:
##  best.randomForest(x = classID ~ ., data = train, mtry = c(2:10),      ntree = 750, importance = TRUE, tunecontrol = cv) 
##                Type of random forest: classification
##                      Number of trees: 750
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 54.17%
## Confusion matrix:
##    1    2   3    4   5   6 class.error
## 1 20  236   9  373  15  55   0.9717514
## 2 45 1002  89 1150  88 260   0.6195900
## 3  3  155 148   58  95  43   0.7051793
## 4 53  831  18 2369  49 236   0.3338020
## 5  3  178  60  121 398  97   0.5355893
## 6 25  335  29  331  46 366   0.6766784
# Check at which number of trees the model stabilises. 
plot(RF_modelC3) 

3.1.1. Variable Importance

# Check which variables provide more information
# Calculate variable importance with importance()
RF1_imp <- importance(RF_modelC3, type = 1) # select type 1 for Mean decrease accuracy
typeof(RF1_imp)
## [1] "double"
# Convert to a a data table
dt1 <- data.table::as.data.table(RF1_imp, keep.rownames = "var") %>% 
  .[order(-MeanDecreaseAccuracy)]
dt1
##      var MeanDecreaseAccuracy
## 1:   NIR            120.66887
## 2:  blue            112.57748
## 3:   red             71.93014
## 4: green             57.23620

# Plot the graph
Graph3a <- ggplot(data=dt1, aes(x= reorder(var, MeanDecreaseAccuracy), y = MeanDecreaseAccuracy)) +
  labs(title="RFC\n(spectral)", 
       x="", y = "") +
  geom_bar(stat="identity", fill='#f6e8c3', width=0.5) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  coord_flip()

Graph3a

3.1.2. Map of predicted classes

Perform a classification of the image stack using the predict() function.

# Run predict() to store RF predictions
map <- predict(mosaic_C3, RF_modelC3)

# Plot raster
plot(map)

freq(map)
##      value  count
## [1,]     1  21461
## [2,]     2 454440
## [3,]     3  61075
## [4,]     4 583038
## [5,]     5 149557
## [6,]     6 193753
## [7,]    NA 605928

# Uncomment to write classification to disk.
# writeRaster(map, filename="predicted_map", datatype="INT1S", overwrite=T)

3.1.3. Model Performance

Area Adjusted Accuracy assessment calculated as in http://reddcr.go.cr/sites/default/files/centro-de-documentacion/olofsson_et_al._2014_-_good_practices_for_estimating_area_and_assessing_accuracy_of_land_change.pdf

Code source: https://blogs.fu-berlin.de/reseda/area-adjusted-accuracies/

# Extract the predictions from the model, using the test dataset
predClass <- predict(RF_modelC3, test)

# Use as.factor() for conversion of the classID column in the test dataset
test_acc <- as.factor(test$classID) 

# Build a contingency table of the counts at each combination of factor levels.
# Rename the vectors accordingly.
confmat <- table("pred" = predClass, "ref" = test_acc)

# get number of pixels per class and convert in km²
imgVal <- as.factor(getValues(map))
# Remove NA values from the classified dataset 
imgVal <- na.omit(imgVal)
# Extract the number of classes
nclass <- length(unique(train$classID))
# Calculate the total area per class
maparea <- sapply(1:nclass, function(x) sum(imgVal == x))
# Transform area in km2
maparea <- maparea * res(map)[1] ^ 2 / 1000000

# total  map area
A <- sum(maparea)
# proportion of area mapped as class i
W_i <- maparea / A
# number of reference points per class
n_i <- rowSums(confmat) 
# population error matrix
p <- W_i * confmat / n_i
p[is.na(p)] <- 0

# area estimation
p_area <- colSums(p) * A
# overall accuracy
OA <- sum(diag(p))
# producers accuracy
PA <- diag(p) / colSums(p)
# users accuracy
UA <- diag(p) / rowSums(p)

# gather results
result <- matrix(c(p_area, PA * 100, UA * 100, c(OA * 100, rep(NA, nclass-1))), nrow = nclass)
result <- round(result, digits = 2) 
rownames(result) <- levels(as.factor(train$classID))
colnames(result) <- c("km²","PA", "UA", "OA")
class(result) <- "table"
result
##     km²    PA    UA    OA
## 1  0.96  0.56  2.78 44.32
## 2  3.63 40.60 36.02      
## 3  0.80 30.74 44.71      
## 4  4.65 60.30 53.47      
## 5  1.43 48.95 52.00      
## 6  1.70 35.72 34.84

3.2. Model 2: Spectral + Texture features.

3.2.1. GLCM - Gray level co-occurrence matrix

Now, we will add texture features to see if it improves the slum detection.

This code follows the methodology of: https://ieeexplore.ieee.org/document/7447704

Code Source: https://zia207.github.io/geospatial-r-github.io/texture-analysis.html#texture-analysis.

We’ll calculate the GLCM for a 5 x 5 kernel or window size. To calculate for different kernel (15x15, 21x21,…), change the “window” parameter.

# Calculate over all directions
# Red band
texturesRed <- glcm(raster(mosaic_C3, layer=1), 
                         window = c(5,5), 
                         statistics = "variance",
                         shift=list(c(0,1), c(1,1), c(1,0), c(1,-1)))
plot(texturesRed)


# Green band
texturesGreen <- glcm(raster(mosaic_C3, layer=2), 
                         window = c(5,5), 
                         statistics = "variance",
                         shift=list(c(0,1), c(1,1), c(1,0), c(1,-1)))

plot(texturesGreen)


# Blue band
texturesBlue <- glcm(raster(mosaic_C3, layer=3), 
                          window = c(5,5), 
                          statistics = "variance",
                          shift=list(c(0,1), c(1,1), c(1,0), c(1,-1)))
plot(texturesBlue)


# NIR band
texturesNIR <- glcm(raster(mosaic_C3, layer=4), 
                          window = c(5,5), 
                          statistics = "variance",
                          shift=list(c(0,1), c(1,1), c(1,0), c(1,-1)))
plot(texturesNIR)

3.2.2. Principal Component Analysis (PCA) of Texture Bands

To decrease redundancy of texture bands and to determine the appropriate texture features, we will apply PCA to all texture images.

The function rasterPCA() will calculate the PCA of our raster stack and will return a raster brick with multiple layers of PCA scores.

Source code: https://zia207.github.io/geospatial-r-github.io/texture-analysis.html#texture-analysis

# Stack the glcm layers of all bands
mosaic_C3glcm <- stack(texturesRed, texturesGreen, texturesBlue, texturesNIR)

# Develop a PCA model
mosaic_C3glcm2 <- scale(mosaic_C3glcm)        # scale the data
mosaic_C3glcm2[is.na(mosaic_C3glcm2)] <- 0  # define zero  all missing values
mosaic_C3PCA <- rasterPCA(mosaic_C3glcm2, nComp=4)
summary(mosaic_C3PCA$model)
## Importance of components:
##                          Comp.1    Comp.2     Comp.3      Comp.4
## Standard deviation     1.494540 0.7157898 0.18889005 0.124274994
## Proportion of Variance 0.798551 0.1831718 0.01275574 0.005521476
## Cumulative Proportion  0.798551 0.9817228 0.99447852 1.000000000

# Extract first 2 PCs from the model
# Since, the first two PCs account for more of the variability of these textures (see standard deviation), we will extract these 2 components.
# The values from these raster layers will be used as variables for our classification.
PCA1 <- mosaic_C3PCA$map$PC1
PCA2 <- mosaic_C3PCA$map$PC2

# Stack into one raster.
PC_glcm <- stack(PCA1, PCA2)

# Stack with the original mosaic
mosaic_C3ST <- stack(mosaic_C3, PC_glcm)

3.2.4. Training data

# Extract image values at training point locations
trainC3.spGLCM <- raster::extract(PC_glcm, trainC3, sp=T)

# Convert to data.frame 
trainC3.dfGLCM <- as.data.frame(trainC3.spGLCM)

# Merge spectral and glcm values
trainC3.dfST <- merge(trainC3.df,trainC3.dfGLCM, by = "id")

# Rename column classID.x as classID
names(trainC3.dfST)[2] <- 'classID'

# Use as.factor() for conversion of the classID column.
trainC3.dfST$classID <- as.factor(trainC3.dfST$classID) 
str(trainC3.dfST) # allows you to see the classes of the variables (all numeric)
## 'data.frame':    11733 obs. of  17 variables:
##  $ id          : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ classID     : Factor w/ 6 levels "1","2","3","4",..: 4 5 2 4 4 4 4 4 4 4 ...
##  $ class_name.x: chr  "formal_settlement" "non_settlement" "informal_settlement_type2" "formal_settlement" ...
##  $ circ_num.x  : num  3 3 3 3 3 3 3 3 3 3 ...
##  $ blue        : int  547 1192 761 1955 773 1489 1405 989 1383 541 ...
##  $ green       : int  632 1396 839 2161 858 1534 1472 1137 1507 618 ...
##  $ red         : int  523 1638 833 2198 810 1457 1482 1152 1369 529 ...
##  $ NIR         : int  965 2367 1188 2655 1318 1875 1787 1750 1699 1091 ...
##  $ coords.x1.x : num  404857 404829 403756 403969 403935 ...
##  $ coords.x2.x : num  2045945 2045801 2046283 2046086 2046240 ...
##  $ classID.y   : num  4 5 2 4 4 4 4 4 4 4 ...
##  $ class_name.y: chr  "formal_settlement" "non_settlement" "informal_settlement_type2" "formal_settlement" ...
##  $ circ_num.y  : num  3 3 3 3 3 3 3 3 3 3 ...
##  $ PC1         : num  -0.817 2.798 -0.401 5.3 -0.318 ...
##  $ PC2         : num  0.0432 -2.8057 0.2067 -2.6092 0.2352 ...
##  $ coords.x1.y : num  404857 404829 403756 403969 403935 ...
##  $ coords.x2.y : num  2045945 2045801 2046283 2046086 2046240 ...

# Include only useful predictors in the model.
trainC3_dfST <- select(trainC3.dfST, -c("id", "class_name.x","circ_num.x","coords.x1.x","coords.x2.x",
                                    "classID.y","class_name.y","circ_num.y","coords.x1.y","coords.x2.y")) 
# The RF algorithm cannot deal with NoData (NA) values. Remove NAs from the data.frame.
sum(is.na(trainC3_dfST)) # check how many null values there are
## [1] 0
trainC3_dfST <- na.omit(trainC3_dfST)
sum(is.na(trainC3_dfST))
## [1] 0

# Set a portion of the data aside for testing
sampleST <- createDataPartition(trainC3_dfST$classID, p = .80, list = FALSE)
trainST <- trainC3_dfST[sampleST,]
testST <- trainC3_dfST[-sampleST,]

# Check how many data points are per class
trainST %>% group_by(classID) %>% count()
## # A tibble: 6 x 2
## # Groups:   classID [6]
##   classID     n
##   <fct>   <int>
## 1 1         708
## 2 2        2634
## 3 3         502
## 4 4        3556
## 5 5         857
## 6 6        1132
testST %>% group_by(classID) %>% count()
## # A tibble: 6 x 2
## # Groups:   classID [6]
##   classID     n
##   <fct>   <int>
## 1 1         177
## 2 2         658
## 3 3         125
## 4 4         888
## 5 5         214
## 6 6         282

3.2.5. Random Forest Model (Spectral + Texture)

# Define accuracy from 5-fold cross-validation as optimization measure
cvST <- tune.control(cross = 5) 

# Use tune.randomForest to assess the optimal combination of ntree and mtry
RF_modelC3ST.tune <- tune.randomForest(classID~., 
                                     data = trainST, 
                                     ntree=750, 
                                     mtry=c(2:10), 
                                     importance = TRUE,
                                     tunecontrol = cvST)

# Store the best model in a new object for further use
RF_modelC3ST <- RF_modelC3ST.tune$best.model
print(RF_modelC3ST)
## 
## Call:
##  best.randomForest(x = classID ~ ., data = trainST, mtry = c(2:10),      ntree = 750, importance = TRUE, tunecontrol = cvST) 
##                Type of random forest: classification
##                      Number of trees: 750
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 51.87%
## Confusion matrix:
##    1    2   3    4   5   6 class.error
## 1 15  247   1  383  15  47   0.9788136
## 2 28 1047  77 1155  89 238   0.6025057
## 3  1  151 143   79 106  22   0.7151394
## 4 30  798  13 2454  52 209   0.3098988
## 5  1  130  73  143 452  58   0.4725788
## 6 13  287  12  361  51 408   0.6395760

3.2.6. Variable Importance

# Calculate variable importance with importance()
RF2_imp <- importance(RF_modelC3ST, type = 1) # select type 1 for Mean decrease accuracy
typeof(RF2_imp)
## [1] "double"
# Convert to a a data table
dt2 <- data.table::as.data.table(RF2_imp, keep.rownames = "var") %>% 
  .[order(-MeanDecreaseAccuracy)] %>% 
  .[var == "PC1", var := "GLCM Red"] %>% # Rename GLCM variables 
  .[var == "PC2", var := "GLCM Green"]
  

# Plot the graph
Graph3b <- ggplot(data=dt2, aes(x= reorder(var, MeanDecreaseAccuracy), y = MeanDecreaseAccuracy)) +
  labs(title="RFC\n(spectral + texture)\n5 x 5", 
       x="", y = "") +
  geom_bar(stat="identity", fill='#f6e8c3', width=0.5) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  coord_flip()

Graph3b 

3.2.7. Map of predicted classes

Perform a classification of the image stack using the predict() function.

# Run predict() to store RF predictions
mapST <- predict(mosaic_C3ST, RF_modelC3ST)

# Plot raster
plot(mapST)

freq(mapST)
##      value  count
## [1,]     1  10615
## [2,]     2 447154
## [3,]     3  45705
## [4,]     4 605586
## [5,]     5 167633
## [6,]     6 186631
## [7,]    NA 605928

# Uncomment to write classification to disk.
# writeRaster(mapST, filename="predicted_mapST", datatype="INT1S", overwrite=T)

3.2.8. Model Performance

# Extract the predictions from the model, using the test dataset
predClassST <- predict(RF_modelC3ST, testST)

# Use as.factor() for conversion of the classID column in the test dataset
test_accST <- as.factor(testST$classID) 

# Build a contingency table of the counts at each combination of factor levels.
# Rename the vectors accordingly.
confmat_ST <- table("pred" = predClassST, "ref" = test_accST)

# get number of pixels per class and convert in km²
imgVal_ST <- as.factor(getValues(mapST))
# Remove NA values from the classified dataset 
imgVal_ST <- na.omit(imgVal_ST)
# Extract the number of classes
nclass_ST <- length(unique(trainST$classID))
# Calculate the total area per class
maparea_ST <- sapply(1:nclass_ST, function(x) sum(imgVal_ST == x))
# Transform area in km2
maparea_ST <- maparea_ST * res(mapST)[1] ^ 2 / 1000000

# total  map area
A_ST <- sum(maparea_ST)
# proportion of area mapped as class i
W_i_ST <- maparea_ST / A_ST
# number of reference points per class
n_i_ST <- rowSums(confmat_ST) 
# population error matrix 
p_ST <- W_i_ST * confmat_ST / n_i_ST
p_ST[is.na(p_ST)] <- 0

# area estimation
p_area_ST <- colSums(p_ST) * A_ST
# overall accuracy 
OA_ST <- sum(diag(p_ST))
# producers accuracy 
PA_ST <- diag(p_ST) / colSums(p_ST)
# users accuracy 
UA_ST <- diag(p_ST) / rowSums(p_ST)


# gather results
result_ST <- matrix(c(p_area_ST, PA_ST * 100, UA_ST * 100, c(OA_ST * 100, rep(NA, nclass_ST-1))), nrow = nclass_ST)
result_ST <- round(result_ST, digits = 2) 
rownames(result_ST) <- levels(as.factor(trainST$classID))
colnames(result_ST) <- c("km²", "PA", "UA", "OA")
class(result_ST) <- "table"
result_ST
##     km²    PA    UA    OA
## 1  0.94  1.61 15.79 47.75
## 2  3.68 43.19 39.50      
## 3  0.75 25.59 46.91      
## 4  4.58 64.81 54.40      
## 5  1.49 59.87 59.02      
## 6  1.74 36.61 37.84

4. Data Visualisation

Let’s plot some maps with the resulting classification

4.1. Training data

First, let’s plot the training data over the PlanetScope mosaic.


# Set the map's properties
Map1 <- ggplot() +
  ggRGB(mosaic_C3,
        r = 3,
        g = 2,
        b = 1,
        stretch = "lin",
        ggLayer = TRUE) +
  geom_sf(data = trainC3,
          size = 0.4,
           aes(color = factor(classID))) +
  scale_color_manual(name = 'Classes',
                     values = c('#8c510a', '#d8b365', '#f6e8c3','#c7eae5','#5ab4ac','#01665e'),
                     labels = c('Informal Type I','Informal Type II', 'Informal Type III',
                                'Formal', 'No-settlement', 'Roads and asphalt')) +
  theme(legend.title = element_text(size = 9),
        legend.text = element_text(size = 6),
        legend.key = element_rect(fill = NA),
        legend.position= c(1.2, 0.62),
        axis.ticks = element_line(colour = "grey70", size = 0.2),
        panel.grid.major = element_line(colour = "grey70", size = 0.2),
        panel.background = element_blank()) +
  labs(x="", 
       y="") +
  ggsn::scalebar(data = trainC3,
                 dist = 0.5, 
                 dist_unit = "km",
                 transform = FALSE,
                 height = 0.01,
                 border.size = 0.5,
                 location = "bottomleft",
                 st.dist = 0.05) +
  north(data=trainC3,
        location= "topright",
        symbol = 10) 

Map1


# Uncomment to Save the map
# ggsave("Map1.png",
#        plot = Map1,
#        dpi = 600)

4.2. Classification map

Map with all predicted classes.

Model 1. Spectral
# Map 3 - Classification maps
  
  # Map classification #1
  # convert raster in data frame
  mapVal <-  rasterToPoints(map)    
mapdf <- data.frame(mapVal)
colnames(mapdf) <- c('Longitude', 'Latitude', 'classID')


# Set the map's properties
Map3_1 <- ggplot() +
  geom_raster(data = mapdf,
              aes(y=Latitude,
                  x=Longitude,
                  fill = factor(classID))) +
  coord_equal() +
  scale_fill_manual(name = 'Classes',
                    values = c('#8c510a', '#d8b365', '#f6e8c3','#c7eae5','#5ab4ac','#01665e'),
                    labels = c('Informal Type I','Informal Type II', 'Informal Type III',
                               'Formal', 'No-settlement', 'Roads and asphalt')) +
  theme(legend.title = element_text(size = 15),
        legend.text = element_text(size = 12),
        legend.key = element_rect(fill = NA),
        axis.ticks = element_blank(),
        axis.text = element_blank(),
        panel.grid.major = element_blank(),
        panel.background = element_blank()) +
  labs(title="Predicted Classification Map. RFC Spectral",
       x="", 
       y="") +
  ggsn::scalebar(data = trainC3,
                 dist = 0.5, 
                 dist_unit = "km",
                 transform = FALSE,
                 height = 0.02,
                 border.size = 0.5,
                 location = "bottomright",
                 st.dist = 0.05) +
  north(data=trainC3,
        location= "topright",
        symbol = 10) 
Map3_1

Model 2. Spectral + Texture
# Map classification #2
# convert raster in data frame 
mapValST <-  rasterToPoints(mapST)    
mapdfST <- data.frame(mapValST)
colnames(mapdfST) <- c('Longitude', 'Latitude', 'classID')

# Set the map's properties
Map3_2 <- ggplot() +
  geom_raster(data = mapdfST,
              aes(y=Latitude,
                  x=Longitude,
                  fill = factor(classID))) +
  coord_equal() +
  scale_fill_manual(name = 'Classes',
                    values = c('#8c510a', '#d8b365', '#f6e8c3','#c7eae5','#5ab4ac','#01665e'),
                    labels = c('Informal Type I','Informal Type II', 'Informal Type III',
                               'Formal', 'No-settlement', 'Roads and asphalt')) +
  theme(legend.title = element_text(size = 15),
        legend.text = element_text(size = 12),
        legend.key = element_rect(fill = NA),
        axis.ticks = element_blank(),
        axis.text = element_blank(),
        panel.grid.major = element_blank(),
        panel.background = element_blank()) +
  labs(title="Predicted Classification Map. RFC Spectral + Texture",
       x="", 
       y="") +
  ggsn::scalebar(data = trainC3,
                 dist = 0.5, 
                 dist_unit = "km",
                 transform = FALSE,
                 height = 0.01,
                 border.size = 0.5,
                 location = "bottomright",
                 st.dist = 0.05) +
  north(data=trainC3,
        location= "topright",
        symbol = 10) 
Map3_2

4.3. Informal settlements map

Map with all predicted slum classes.

Model 1. Spectral
# Map 4 - Slum maps (by type)

# Map slums #1
# Subset dataframe 
mapdf_slums <- filter(mapdf, classID == '1' | classID == '2' | classID == '3')

# Set the map's properties
Map4_1 <- ggplot() +
  ggRGB(mosaic_C3,
        r = 3,
        g = 2,
        b = 1,
        stretch = "lin",
        ggLayer = TRUE) +
  geom_tile(data = mapdf_slums,
            aes(y=Latitude,
                x=Longitude,
                fill = factor(classID))) +
  coord_equal() +
  scale_fill_manual(name = 'Classes',
                    values = c('#8c510a', '#d8b365', '#f6e8c3'),
                    labels = c('Informal Type I','Informal Type II', 'Informal Type III')) +
  theme(legend.title = element_text(size = 15),
        legend.text = element_text(size = 12),
        legend.key = element_rect(fill = NA),
        axis.ticks = element_blank(),
        axis.text = element_blank(),
        panel.grid.major = element_blank(),
        panel.background = element_blank()) +
  labs(title="Predicted Informal Settlements Classification Map. RFC Spectral", 
       x="", 
       y="") +
  ggsn::scalebar(data = trainC3,
                 dist = 0.5, 
                 dist_unit = "km",
                 transform = FALSE,
                 height = 0.01,
                 border.size = 0.5,
                 location = "bottomright",
                 st.dist = 0.05) +
  north(data=trainC3,
        location= "topright",
        symbol = 10)  
Map4_1

Model 2. Spectral + Texture
# Map slums #2
# Subset dataframe 
mapdfST_slums <- filter(mapdfST, classID == '1' | classID == '2' | classID == '3')

# Set the map's properties
Map4_2 <- ggplot() +
  ggRGB(mosaic_C3,
        r = 3,
        g = 2,
        b = 1,
        stretch = "lin",
        ggLayer = TRUE) +
  geom_tile(data = mapdfST_slums,
            aes(y=Latitude,
                x=Longitude,
                fill = factor(classID))) +
  coord_equal() +
  scale_fill_manual(name = 'Classes',
                    values = c('#8c510a', '#d8b365', '#f6e8c3'),
                    labels = c('Informal Type I','Informal Type II', 'Informal Type III')) +
  theme(legend.title = element_text(size = 15),
        legend.text = element_text(size = 12),
        legend.key = element_rect(fill = NA),
        axis.ticks = element_blank(),
        axis.text = element_blank(),
        panel.grid.major = element_blank(),
        panel.background = element_blank()) +
  labs(title="Predicted Informal Settlements Classification Map. RFC Spectral + Texture",
       x="", 
       y="") +
  ggsn::scalebar(data = trainC3,
                 dist = 0.5, 
                 dist_unit = "km",
                 transform = FALSE,
                 height = 0.01,
                 border.size = 0.5,
                 location = "bottomright",
                 st.dist = 0.05) +
  north(data=trainC3,
        location= "topright",
        symbol = 10) 

Map4_2